Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(stringr)

Figure out which of these packages is already installed

# Store all installed packages
ya_installed <- library()$results[,1]

# Check whether required packages are already installed and grab only those that still need installation
need_install<-my_packages[!(my_packages %in% ya_installed)]

#install required packages
lapply({need_install}, install.packages, character.only = TRUE)

Now, load only unloaded packages

# Store all installed packages
ya_loaded <- (.packages())

# Check whether required packages are already installed and grab only those that still need installation
need_load<-my_packages[!(my_packages %in% ya_loaded)]

# Load required packages
lapply(need_load, require, character.only = TRUE)
## Loading required package: bipartite
## Loading required package: sna
## Loading required package: statnet.common
## 
## Attaching package: 'statnet.common'
## The following objects are masked from 'package:base':
## 
##     attr, order
## Loading required package: network
## 
## 'network' 1.19.0 (2024-12-08), part of the Statnet Project
## * 'news(package="network")' for changes since last version
## * 'citation("network")' for citation information
## * 'https://statnet.org' for help, support, and other information
## sna: Tools for Social Network Analysis
## Version 2.8 created on 2024-09-07.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##  For citation information, type citation("sna").
##  Type help(package="sna") to get started.
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
##  This is bipartite 2.21.
##  For latest changes see versionlog in ?"bipartite-package". For citation see: citation("bipartite").
##  Have a nice time plotting and analysing two-mode networks.
## 
## Attaching package: 'bipartite'
## The following object is masked from 'package:vegan':
## 
##     nullmodel
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:bipartite':
## 
##     strength
## The following object is masked from 'package:vegan':
## 
##     diversity
## The following object is masked from 'package:permute':
## 
##     permute
## The following objects are masked from 'package:sna':
## 
##     betweenness, bonpow, closeness, components, degree, dyad.census,
##     evcent, hierarchy, is.connected, neighborhood, triad.census
## The following objects are masked from 'package:network':
## 
##     %c%, %s%, add.edges, add.vertices, delete.edges, delete.vertices,
##     get.edge.attribute, get.edges, get.vertex.attribute, is.bipartite,
##     is.directed, list.edge.attributes, list.vertex.attributes,
##     set.edge.attribute, set.vertex.attribute
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union

Read in Data

Read in the csv files

athletes_full<-read.csv("athletes.csv")
athletes <- athletes_full %>%
  select(name, country, disciplines, events, birth_date)

medals_full <-read.csv("medallists.csv")
medals <- medals_full %>%
  select(name, medal_type, medal_code, country, discipline, event)

athletes$disciplines <- str_replace_all(athletes$disciplines, "\\['", "")
athletes$disciplines <- str_replace_all(athletes$disciplines, "'\\]", "")
athletes$disciplines <- str_replace_all(athletes$disciplines, "\\[\"", "")
athletes$disciplines <- str_replace_all(athletes$disciplines, "\"\\]", "")

athletes$events <- str_replace_all(athletes$events, "\\['", "")
athletes$events <- str_replace_all(athletes$events, "'\\]", "")
athletes$events <- str_replace_all(athletes$events, "\\[\"", "")
athletes$events <- str_replace_all(athletes$events, "\"\\]", "")

medals$medal_type <- str_replace_all(medals$medal_type, " Medal", "")

Creating the bipartite graphs

#Creating webID
athletes_webID <- data.frame(matrix("olympicweb", nrow = nrow(athletes), ncol = 1))
new_athletes <- cbind(athletes, athletes_webID)
colnames(new_athletes)<- c("name", "country", "disciplines", "events", "birth_date", "webID")

Plotting network interactions between two node types

#Generate the graph object using frame2webs()
web_athletes <- frame2webs(new_athletes, varnames = c("country", "disciplines", "webID"), type.out = "list", emptylist = TRUE)

Bipartite Graph of new_athletes

#creating a color vector
cols1 <-c( '#8214a0', '#005ac8', '#00a0fa', '#fa78fa', '#14d2dc', '#aa0a3c', '#fa7850', '#0ab45a', '#f0f032', '#a0fa82', '#fae6be') 
#plotting the two-dimensional matrix to a bipartite graph
plotweb(web_athletes$"olympicweb", method='cca', labsize=1.2, x.lim=c(0,4), y.lim=c(-0.6,2.8), text.rot=90, col.interaction=cols1, bor.col.interaction=cols1) 
title("Athlete and Country Bipartite Network")

#PNG
png("img/athlete_country_bipartite.png", width = 3000, height = 1200, res = 300)
plotweb(web_athletes$"olympicweb", method = 'cca', labsize=0.6, y.lim=c(-0.5,2), text.rot = 90, 
        col.interaction = cols1, bor.col.interaction = cols1)
dev.off()
## quartz_off_screen 
##                 2
athletes_by_country <- athletes %>%
  count(country, name = "num_athletes")

# countrys with only 1 or 2 athletes
small_country <- athletes_by_country %>%
  filter(num_athletes <= 15) %>%
  pull(country)

# Update country in original data
athletes_cleaned <- athletes %>%
  mutate(country = ifelse(country %in% small_country, "Other Countries", country))

#Creating webID
athletes_cleaned_webID <- data.frame(matrix("olympicweb", nrow = nrow(athletes_cleaned), ncol = 1))
new_athletes_cleaned <- cbind(athletes_cleaned, athletes_cleaned_webID)
colnames(new_athletes_cleaned)<- c("name", "country", "disciplines", "events", "birth_date", "webID")

Plotting network interactions between two node types

#Generate the graph object using frame2webs()
web_cleaned_athletes <- frame2webs(new_athletes_cleaned, varnames = c("country", "disciplines", "webID"), type.out = "list", emptylist = TRUE)

Bipartite Graph of new_athletes

#creating a color vector
cols1 <-c( '#8214a0', '#005ac8', '#00a0fa', '#fa78fa', '#14d2dc', '#aa0a3c', '#fa7850', '#0ab45a', '#f0f032', '#a0fa82', '#fae6be') 
#plotting the two-dimensional matrix to a bipartite graph
plotweb(web_cleaned_athletes$"olympicweb", method='cca', labsize=1.2, x.lim=c(0,4), y.lim=c(-0.6,2.8), text.rot=90, col.interaction=cols1, bor.col.interaction=cols1)      
title("Simplified Athlete and Country Bipartite Network")

<img src=“olympic_network_analysis_files/figure-html/bipartite graph visualization with”Other Countries”-1.png” alt=“Simplified bipartite network where countries with 15 or fewer athletes are grouped into an ‘Other Countries’ node to reduce visual clutter.” width=“672” />

# Save as PNG
png("img/simple_athlete_country_bipartite.png", width = 3000, height = 1200, res = 300)
plotweb(web_cleaned_athletes$"olympicweb", method='cca', labsize=0.6, y.lim=c(-0.2,2), text.rot=90,
        col.interaction=cols1, bor.col.interaction=cols1)      

dev.off()
## quartz_off_screen 
##                 2
bottom_20_countrys <- athletes_by_country %>%
  slice_min(num_athletes, n = 20) %>%
  pull(country)
athletes_bottom20 <- athletes %>%
  filter(country %in% bottom_20_countrys)
athletes_bottom20_webID <- data.frame(matrix("olympicweb", nrow = nrow(athletes_bottom20), ncol = 1))
new_athletes_bottom20 <- cbind(athletes_bottom20, athletes_bottom20_webID)
colnames(new_athletes_bottom20)<- c("name", "country", "disciplines", "events", "birth_date", "webID")
web_new_athletes_bottom20 <- frame2webs(new_athletes_bottom20, varnames = c("country", "disciplines", "webID"), type.out = "list", emptylist = TRUE)

web_mat <- web_new_athletes_bottom20$"olympicweb"

par(mar = c(5, 5, 5, 5))  # Extra margins for label room
plotPAC(
  web_mat,
  scaling = 2,
  fill.col = rgb(0, 110/255, 130/255, 0.5),
  arrow.col = rgb(0, 110/255, 130/255, 0.5)
)

PAC network showing connections between the bottom 20 countries by number of athletes and the disciplines they participated in. It shows overlapping disciplines among smaller delegations.

rownames(new_athletes_bottom20$"olympicweb")
## NULL

Bhutan and Chad share Archery and Athletics, and Sao Tome and Principe and Chad share Athletics and Judo

png("img/unimodal.png", width = 4000, height = 2000, res = 300)
set.seed(2)
par(mar = c(5, 5, 5, 5))  # Extra margins for label room
plotPAC(
  web_mat,
  scaling = 2,
  fill.col = rgb(0, 110/255, 130/255, 0.5),
  arrow.col = rgb(0, 110/255, 130/255, 0.5)
)
rownames(new_athletes_bottom20$"olympicweb")
## NULL
dev.off()
## quartz_off_screen 
##                 2
athletes_by_country <- athletes %>%
  count(country, name = "num_athletes") %>%
  arrange(desc(num_athletes))

theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
top_countrys <- athletes_by_country %>% slice_max(num_athletes, n = 20)

ggplot(top_countrys, aes(x = reorder(country, -num_athletes), y = num_athletes)) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Number of Athletes per Country/country",
       x = "Country/country",
       y = "Number of Athletes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Bar chart showing the top 20 countries with the highest number of athletes. Shows countries like the United States, China, and France as top participants.

# Save as PNG
png("img/num_athletes_vs_top_country.png", width = 3000, height = 1200, res = 300)
ggplot(top_countrys, aes(x = reorder(country, -num_athletes), y = num_athletes)) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Number of Athletes per Country/country",
       x = "Country/country",
       y = "Number of Athletes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
dev.off()
## quartz_off_screen 
##                 2
bottom_countrys <- athletes_by_country %>%
  slice_min(num_athletes, n = 20)

ggplot(bottom_countrys, aes(x = reorder(country, num_athletes), y = num_athletes)) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Bottom 20 Countries by Number of Athletes",
       x = "Country/country",
       y = "Number of Athletes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Bar chart showing the bottom 20 countries by number of athletes sent to the Olympics. Countries with only one, two, or three representatives are included.

# Save as PNG
png("img/num_athletes_vs_bottom_country.png", width = 3000, height = 1200, res = 300)
ggplot(bottom_countrys, aes(x = reorder(country, num_athletes), y = num_athletes)) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Bottom 20 Countries by Number of Athletes",
       x = "Country/country",
       y = "Number of Athletes") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
dev.off()
## quartz_off_screen 
##                 2
name_medal <- as.matrix(cbind(medals$name, medals$medal_type))
japan_medalist <- as.matrix(cbind(name_medal,medals$country))
japan_medalist<-as.data.frame(japan_medalist)
japan_medalist <- japan_medalist %>%
  filter(V3 == "Japan") %>%
  select(V1, V2)
japan_medalist <- as.matrix(japan_medalist)
japan_medalist.g <- graph_from_edgelist(japan_medalist, directed = FALSE)
V(japan_medalist.g)$type <- bipartite_mapping(japan_medalist.g)$type
bipart_data_japan <- as_biadjacency_matrix(japan_medalist.g)
#Creating sociomatrix for medalist
japan_medalist.mat <- bipart_data_japan %*% t(bipart_data_japan) 
#show athletes_medalist.mat matrix
diag(japan_medalist.mat) <- NA

Creating the network graph of medalists

# Create an undirected graph from the sociomatrix of the dcs instructor network. 
japan_medalist.g <- graph_from_adjacency_matrix(japan_medalist.mat, mode="undirected")

Plotting the network graph of courses

set.seed(1)
japan_clusters <- cluster_louvain(japan_medalist.g)
V(japan_medalist.g)$cluster <- japan_clusters$membership
jp_medal_colors <- case_when(
  japan_clusters$membership == 1 ~ "darkgreen",
  japan_clusters$membership == 2 ~ "gold",
  japan_clusters$membership == 3 ~ "grey"
)


#layout setting
la <- layout_with_fr(japan_medalist.g, niter = 1000000, area = vcount(japan_medalist.g)^8)
## Warning: The `area` argument of `layout_with_fr()` is deprecated as of igraph 0.8.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#edge list setting
e.wt <- edge_attr(japan_medalist.g, "weight")
v.wt <- strength(japan_medalist.g, mode = "all")
v.size <- v.wt / max(v.wt) * 10  # scale node sizes between 0 and 30
#plot instructor network
plot(japan_medalist.g, 
     layout=la,
     vertex.size=v.size,
     edge.width=e.wt,
     vertex.color = jp_medal_colors,
     vertex.label= V(japan_medalist.g)$name,
     vertex.label.cex = 0.8
     )

Japanese medalist network colored by medal type. Includes tight clustering of athletes with similar medal results, and Oka Shinnosuke as a central connecting node. Saving as png

png("img/japan_medalist_cluster.png", width = 3000, height = 3000, res = 300)
set.seed(3)
plot(japan_medalist.g, 
     layout=la,
     vertex.size=v.size,
     edge.width=e.wt,
     vertex.color = jp_medal_colors,
     vertex.label= V(japan_medalist.g)$name,
     vertex.label.cex = 0.8
     )
dev.off()
## quartz_off_screen 
##                 2

Statistical Analysis

# Edge Density
edge_density(japan_medalist.g)
## [1] 0.4505176
# → Value close to 1 = very interconnected (like tight team clusters)
# → Value close to 0 = sparse network
# 0.4505176

# Transitivity (clustering coefficient)
transitivity(japan_medalist.g)
## [1] 0.8407694
# → Closer to 1 = strong triadic closure (teammates or tightly knit clusters)
# → Closer to 0 = more linear, disconnected structure
#0.8407694 meaning they have strong triadic closure, and this shows that there were a lot of teams won the medals 

# Betweenness Centrality
btw <- betweenness(japan_medalist.g)
head(sort(btw, decreasing = TRUE), 5)
##  OKA Shinnosuke TSUNODA Natsumi       KANO Koki      ABE Hifumi NAGASE Takanori 
##        446.8571        125.5536        125.5536        125.5536        125.5536
# → Athletes with high values are structural bridges in the network
# → Perfect for identifying connectors like OKA Shinnosuke
#OKA Shinnosuke TSUNODA Natsumi       KANO Koki      ABE Hifumi NAGASE Takanori 
#      446.8571        125.5536        125.5536        125.5536        125.5536 
name_medal <- as.matrix(cbind(medals$name, medals$medal_type))
g_medal <- graph_from_edgelist(name_medal, directed = FALSE)
V(g_medal)$type <- bipartite_mapping(g_medal)$type
# Create biadjacency matrix
bipart_data <- as_biadjacency_matrix(g_medal)

# Project to name–name matrix
medalist_mat <- bipart_data %*% t(bipart_data)
diag(medalist_mat) <- 0

# Create graph
medalist_g <- graph_from_adjacency_matrix(medalist_mat, mode = "undirected", weighted = TRUE)
clusters <- cluster_louvain(medalist_g)
V(medalist_g)$cluster <- clusters$membership
# Define colors based on cluster membership and what each cluster represents
medal_colors <- case_when(
  clusters$membership == 1 ~ "gold",
  clusters$membership == 2 ~ "grey",
  clusters$membership == 3 ~ "darkgreen"
)
layout_medal <- layout_with_fr(medalist_g)
plot(medalist_g,
     layout = layout_medal,
     vertex.size = 5,
     vertex.label = NA,
     vertex.color = medal_colors,
     edge.width = E(medalist_g)$weight / max(E(medalist_g)$weight) * 5,
     main = "Medalist Clustering Based on Shared Medal Types")

Network graph showing medalist clustering by medal type, such as gold, silver, and bronze. Athletes are grouped based on the kind of medal they received.

png("img/medalist_cluster.png", width = 3000, height = 3000, res = 300)
set.seed(1)
layout_medal <- layout_with_fr(medalist_g)
plot(medalist_g,
     layout = layout_medal,
     vertex.size = 5,
     vertex.label = NA,
     vertex.color = medal_colors,
     edge.width = E(medalist_g)$weight / max(E(medalist_g)$weight) * 5,
     main = "Medalist Clustering Based on Shared Medal Types")
dev.off()
## quartz_off_screen 
##                 2
country_discipline <- as.matrix(cbind(athletes$country, athletes$disciplines))

g_cd <- graph_from_edgelist(country_discipline, directed = FALSE)
V(g_cd)$type <- bipartite_mapping(g_cd)$type
bipart_cd <- as_biadjacency_matrix(g_cd)
country_mat <- bipart_cd %*% t(bipart_cd)
diag(country_mat) <- 0
g_country <- graph_from_adjacency_matrix(country_mat, mode = "undirected", weighted = TRUE)
layout_country <- layout_with_fr(g_country)
plot(g_country,
     layout = layout_country,
     vertex.size = 5,
     vertex.label.cex = 0.7,
     vertex.label = V(g_country)$name,
     edge.width = E(g_country)$weight / max(E(g_country)$weight) * 5,
     main = "Country Collaboration Network by Shared Disciplines")

Projected one mode country network based on shared Olympic disciplines of all the countries. Projected one mode country network based on shared Olympic disciplines

#count number of disciplines per country
country_discipline_count <- athletes %>%
  distinct(country, disciplines) %>%         
  count(country, name = "num_disciplines") %>%
  filter(num_disciplines > 15)               

#filter original data
filtered_athletes <- athletes %>%
  filter(country %in% country_discipline_count$country)

#create bipartite edge list: country–discipline
country_discipline <- as.matrix(cbind(filtered_athletes$country, filtered_athletes$disciplines))
g_cd <- graph_from_edgelist(country_discipline, directed = FALSE)
V(g_cd)$type <- bipartite_mapping(g_cd)$type

#create biadjacency matrix and project
bipart_cd <- as_biadjacency_matrix(g_cd)
country_mat <- bipart_cd %*% t(bipart_cd)
diag(country_mat) <- 0

#graph
g_country <- graph_from_adjacency_matrix(country_mat, mode = "undirected", weighted = TRUE)
g_country <- delete_edges(g_country, E(g_country)[weight == 0])

top_countries <- country_discipline_count %>%
  arrange(desc(num_disciplines)) %>%
  slice(1:10) %>%
  pull(country)
V(g_country)$color <- ifelse(V(g_country)$name %in% top_countries, "red", "gold")

Projected one mode country network based on shared Olympic disciplines

set.seed(2)
layout_country <- layout_with_fr(g_country)
plot(g_country,
     layout = layout_country,
     vertex.size = 6,
     vertex.label.cex = 0.7,
     vertex.label = V(g_country)$name,
     vertex.color = V(g_country)$color,
     edge.width = E(g_country)$weight / max(E(g_country)$weight) * 5,
     main = "Country Collaboration Network with Top 10 in Red")

Projected one mode country network based on shared Olympic disciplines. Countries that particiated more than 15 disciplines are selected. Countries are connected if their athletes participated in the same disciplines. Top 10 most active countries are highlighted in red and this shows the high connectivity

png("img/top10_country_collaboration_network.png", width = 2000, height = 2000, res = 300)
set.seed(1)
plot(g_country,
     layout = layout_country,
     vertex.size = 6,
     vertex.label.cex = 0.7,
     vertex.label = V(g_country)$name,
     vertex.color = V(g_country)$color,
     edge.width = E(g_country)$weight / max(E(g_country)$weight) * 5,
     main = "Country Collaboration Network with Top 10 in Red")